BigQuery GoogleAnalytics SQL

Predict Visitor Purchases with a Classification Model in BQMLの記録用 BigQuery ML

Google Developers Japan: ML Study Jams vol.2 : Machine Learning 初心者・中級者向けトレーニングプログラムで、

クーポンもらったので、無料でできた。

自分なりの解釈をつけていく。

このセッションの目的は、あるユーザーの将来の購入見込み率を算定すること。

kaggleの competitionでも同じ目標(objective)だった

kaggleの場合は、当然もっと複雑だけど、、ここの考え方は参考になる。

時系列のモデル作成・評価は、windowサイズ、データの期間の設定が難しそう。

基本のCV算出

日別の形にするには？

code: cv.sql

#standardSQL

WITH visitors AS(

SELECT

date,

COUNT(DISTINCT fullVisitorId) AS total_visitors

FROM data-to-insights.ecommerce.web_analytics

group by date

purchasers AS(

SELECT

date,

COUNT(DISTINCT fullVisitorId) AS total_purchasers

FROM data-to-insights.ecommerce.web_analytics

WHERE totals.transactions IS NOT NULL

group by date

)

SELECT

visitors.date,

total_visitors,

total_purchasers,

round(total_purchasers / total_visitors*100 ,1) AS conversion_rate

FROM visitors

join purchasers on visitors.date = purchasers.date

order by 1

top5 selling products,

以下のSQLになってるけど、Google Analyticsでは、cart や detail段階でも、productQuantityは数字があるので、たぶん、間違ってると思う。以下のどちらかを条件付けするのがよいはず。

where p.localProductRevenue is not null

where h.eCommerceAction.action_type = 6

code: top5_products.sql

SELECT

p.v2ProductName,

p.v2ProductCategory,

SUM(p.productQuantity) AS units_sold,

ROUND(SUM(p.localProductRevenue/1000000),2) AS revenue

FROM data-to-insights.ecommerce.web_analytics,

UNNEST(hits) AS h,

UNNEST(h.product) AS p

GROUP BY 1, 2

ORDER BY revenue DESC

LIMIT 5;

再帰セッションで買ってるユーザーの特定

再帰で購入がある場合が1, 他は0とラベル作り

このあとで、ここに特徴量を追加して、trainging, testデータ・セットにする形

code: return_and_purchased.sql

# visitors who bought on a return visit (could have bought on first as well

WITH all_visitor_stats AS (

SELECT

fullvisitorid, # 741,721 unique visitors

IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit

FROM data-to-insights.ecommerce.web_analytics

GROUP BY fullvisitorid

)

SELECT

COUNT(DISTINCT fullvisitorid) AS total_visitors,

will_buy_on_return_visit

FROM all_visitor_stats

GROUP BY will_buy_on_return_visit

初回セッション時の bounce, timeOnSiteを特徴量にして、その後に購入があったかどうか？

code: create_dataset.sql

SELECT

* EXCEPT(fullVisitorId)

FROM

# features

(SELECT

fullVisitorId,

IFNULL(totals.bounces, 0) AS bounces,

IFNULL(totals.timeOnSite, 0) AS time_on_site

FROM

data-to-insights.ecommerce.web_analytics

WHERE

totals.newVisits = 1)

JOIN

(SELECT

fullvisitorid,

IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit

FROM

data-to-insights.ecommerce.web_analytics

GROUP BY fullvisitorid)

USING (fullVisitorId)

ORDER BY time_on_site DESC

LIMIT 10;

データセットの形ができたので、modelを作る. 特徴量は２つ

optionsで、model_type, labelsを指定　

as 以下で、label, 特徴量のデータを作る

trainの期間とevalの期間を分離する。predictの期間は..predictを何に使うかによるかな。

code: create_model.sql

CREATE OR REPLACE MODEL ecommerce.classification_model

OPTIONS

(

model_type='logistic_reg',

labels = 'will_buy_on_return_visit'

)

#standardSQL

SELECT

* EXCEPT(fullVisitorId)

FROM

# features

(SELECT

fullVisitorId,

IFNULL(totals.bounces, 0) AS bounces,

IFNULL(totals.timeOnSite, 0) AS time_on_site

FROM

data-to-insights.ecommerce.web_analytics

WHERE

totals.newVisits = 1

AND date BETWEEN '20160801' AND '20170430') # train on first 9 months

JOIN

(SELECT

fullvisitorid,

IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit

FROM

data-to-insights.ecommerce.web_analytics

GROUP BY fullvisitorid)

USING (fullVisitorId)

;

別の期間で評価する。

code: eval.sql

SELECT

roc_auc,

CASE

WHEN roc_auc > .9 THEN 'good'

WHEN roc_auc > .8 THEN 'fair'

WHEN roc_auc > .7 THEN 'decent'

WHEN roc_auc > .6 THEN 'not great'

ELSE 'poor' END AS model_quality

FROM

ML.EVALUATE(MODEL ecommerce.classification_model, (

SELECT

* EXCEPT(fullVisitorId)

FROM

# features

(SELECT

fullVisitorId,

IFNULL(totals.bounces, 0) AS bounces,

IFNULL(totals.timeOnSite, 0) AS time_on_site

FROM

data-to-insights.ecommerce.web_analytics

WHERE

totals.newVisits = 1

AND date BETWEEN '20170501' AND '20170630') # eval on 2 months

JOIN

(SELECT

fullvisitorid,

IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit

FROM

data-to-insights.ecommerce.web_analytics

GROUP BY fullvisitorid)

USING (fullVisitorId)

));

予測する。　

code: predict.sql

SELECT

FROM

ml.PREDICT(MODEL ecommerce.classification_model,

(

WITH all_visitor_stats AS (

SELECT

fullvisitorid,

IF(COUNTIF(totals.transactions > 0 AND totals.newVisits IS NULL) > 0, 1, 0) AS will_buy_on_return_visit

FROM data-to-insights.ecommerce.web_analytics

GROUP BY fullvisitorid

)

SELECT

CONCAT(fullvisitorid, '-',CAST(visitId AS STRING)) AS unique_session_id,

# labels

will_buy_on_return_visit,

# behavior on the site

IFNULL(totals.bounces, 0) AS bounces,

IFNULL(totals.timeOnSite, 0) AS time_on_site,

FROM data-to-insights.ecommerce.web_analytics,

JOIN all_visitor_stats USING(fullvisitorid)

WHERE

# only predict for new visits

totals.newVisits = 1

AND date BETWEEN '20170701' AND '20170801' # test 1 month

GROUP BY

unique_session_id,

will_buy_on_return_visit,

bounces,

time_on_site,

)

ORDER BY

predicted_will_buy_on_return_visit DESC;

これで、予測値から上位N%を見た場合に、どれくらいliftがあるかをみる。

実際のqwiklabsでは、もう少し特徴量を増やしたものをやる（そのセッションで ecommerce_actionがどのstepまでいったかどうか）